# QC-Bench Multilingual Evaluation Script (French and Spanish)
# Files needed to run:
# - qc200_FR.json (French translation of the QC200 benchmark)
# - qc200_SP.json (Spanish translation of the QC200 benchmark)

import os
import re
import time
import json
from collections import defaultdict

import requests
import openai
from anthropic import Anthropic
from google import genai
from google.genai import types

# Set your API keys here (replace the "Key1" placeholders).
# setdefault() only fills in a variable that is not already present in the
# environment, so real keys exported by the shell are never overwritten by a
# placeholder left in this file.
os.environ.setdefault("OPENAI_API_KEY", "Key1")
os.environ.setdefault("ANTHROPIC_API_KEY", "Key1")
os.environ.setdefault("GROQ_API_KEY", "Key1")
os.environ.setdefault("GOOGLE_API_KEY", "Key1")

# Initialize clients
# The Gemini key is read from GEMINI_API_KEY first, falling back to GOOGLE_API_KEY.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
client = genai.Client(api_key=GEMINI_API_KEY)
openai.api_key = os.environ["OPENAI_API_KEY"]
anthropic_client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

# -------------------------------
# USER CONFIGURATION SECTION
# -------------------------------

# Input datasets, one JSON file per benchmarked language
french_file, spanish_file = "qc200_FR.json", "qc200_SP.json"

# A question appears in the "missed" report once at least this many models
# answered it incorrectly
miss_threshold = 12

# -------------------------------
# MODEL CONFIGURATION
# -------------------------------

# provider -> {"models": {API model id: label used in reports}}
# Each id must be the API name of the model its label advertises; otherwise
# accuracy figures get attributed to a model that was never queried.
model_configs = {
    "openai": {"models": {
        "gpt-4.1": "GPT-4.1",
        "gpt-4.1-mini": "GPT-4.1 mini",
        "gpt-4.1-nano": "GPT-4.1 nano",
        "gpt-4o": "GPT-4o",
        "gpt-4o-mini-2024-07-18": "GPT-4o-mini",
    }},
    "anthropic": {"models": {
        "claude-3-7-sonnet-20250219": "Claude 3.7 Sonnet",
        "claude-3-5-sonnet-20241022": "Claude 3.5 Sonnet",
        "claude-3-5-haiku-20241022": "Claude 3.5 Haiku",
    }},
    "groq": {"models": {
        "llama3-70b-8192": "LLaMA 3 70B",
        "llama-3.3-70b-versatile": "LLaMA-3.3-70B-Versatile",
        "gemma2-9b-it": "Gemma 9b",
    }},
    "gemini": {"models": {
        "gemini-1.5-pro-latest": "Gemini 1.5 Pro",
        "gemini-2.0-flash": "Gemini 2.0 Flash",
    }},
}

# Labels are evaluated in exactly this order; a label that is absent from this
# list is never benchmarked, even if it is configured above.
model_run_order = (
    # OpenAI
    ["GPT-4.1", "GPT-4.1 mini", "GPT-4.1 nano", "GPT-4o", "GPT-4o-mini"]
    # Anthropic
    + ["Claude 3.7 Sonnet", "Claude 3.5 Sonnet", "Claude 3.5 Haiku"]
    # Groq
    + ["LLaMA 3 70B", "LLaMA-3.3-70B-Versatile", "Gemma 9b"]
    # Gemini
    + ["Gemini 1.5 Pro", "Gemini 2.0 Flash"]
)

# Reverse index of model_configs: report label -> (provider, API model id)
model_lookup = {}
for _provider, _provider_cfg in model_configs.items():
    for _model_id, _label in _provider_cfg["models"].items():
        model_lookup[_label] = (_provider, _model_id)

# -------------------------------
# BENCHMARK EXECUTION
# -------------------------------

# Providers that _query_model knows how to call.
_SUPPORTED_PROVIDERS = frozenset({"openai", "anthropic", "groq", "gemini"})

# A standalone answer letter. The word boundaries stop letters that are part of
# a word (the "A" in "ANSWER" or "LA") from being mistaken for the choice.
_CHOICE_PATTERN = re.compile(r"\b[A-D]\b")

# Per-language prompt wording:
# (instruction, question label, answer-format rule, answer cue)
_PROMPT_WORDING = {
    "french": (
        "Tu es un expert en informatique quantique. Choisis la réponse correcte à la question ci-dessous.",
        "Question",
        "Réponds avec une seule lettre: A, B, C, ou D.",
        "Réponse:",
    ),
    "spanish": (
        "Eres un experto en computación cuántica. Elige la respuesta correcta a la pregunta a continuación.",
        "Pregunta",
        "Responde con una sola letra: A, B, C, o D.",
        "Respuesta:",
    ),
}

# Wording used for any language without an entry above.
_DEFAULT_WORDING = (
    "You are an expert in quantum computing. Choose the correct answer to the question below.",
    "Question",
    "Answer with only one letter: A, B, C, or D.",
    "Answer:",
)


def _build_prompt(lang_key, q):
    """Render one multiple-choice question as a prompt in the given language."""
    intro, question_label, format_rule, answer_cue = _PROMPT_WORDING.get(
        lang_key, _DEFAULT_WORDING
    )
    return (
        f"{intro}\n\n"
        f"{question_label}: {q['question']}\n"
        f"A. {q['A']}\n"
        f"B. {q['B']}\n"
        f"C. {q['C']}\n"
        f"D. {q['D']}\n\n"
        f"{format_rule}\n\n"
        f"{answer_cue}"
    )


def _extract_choice(reply):
    """Return the first standalone A-D letter in the reply, or "?" if there is none."""
    match = _CHOICE_PATTERN.search(reply.strip().upper())
    return match.group(0) if match else "?"


def _query_model(provider, model_id, prompt):
    """Send the prompt to one model and return the raw reply text; raise on failure."""
    messages = [{"role": "user", "content": prompt}]

    if provider == "openai":
        request = {
            "model": model_id,
            "messages": messages,
            "temperature": 0.0,
            "max_tokens": 5,
        }
        # openai>=1.0 removed ChatCompletion in favour of the module-level
        # `openai.chat` client; older releases only have ChatCompletion.
        if hasattr(openai, "chat"):
            resp = openai.chat.completions.create(**request)
        else:
            resp = openai.ChatCompletion.create(**request)
        return resp.choices[0].message.content

    if provider == "anthropic":
        resp = anthropic_client.messages.create(
            model=model_id,
            messages=messages,
            temperature=0.0,
            max_tokens=5,
        )
        return resp.content[0].text

    if provider == "groq":
        try:
            payload = requests.post(
                "https://api.groq.com/openai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {os.environ['GROQ_API_KEY']}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model_id,
                    "messages": messages,
                    "temperature": 0.0,
                    "max_tokens": 5,
                },
                # Without a timeout a stalled connection blocks the whole run.
                timeout=60,
            ).json()
        finally:
            # Pace requests after failures too, so a rate-limit error is not
            # followed by an immediate retry burst.
            time.sleep(0.6)
        if "choices" not in payload:
            raise RuntimeError(f"response has no 'choices': {payload}")
        return payload["choices"][0]["message"]["content"]

    if provider == "gemini":
        resp = client.models.generate_content(
            model=model_id,
            contents=prompt,
            config=types.GenerateContentConfig(
                temperature=0.0,
                max_output_tokens=5,
            ),
        )
        return resp.text

    raise ValueError(f"unsupported provider: {provider}")


def run_multilingual_benchmark(language, filename):
    """Run every configured model against one language's question file.

    The file is loaded as JSON and iterated as a sequence of questions; each
    question is read through the keys "question", "A", "B", "C", "D" and
    "solution". Models are taken from ``model_run_order`` (labels missing
    from ``model_lookup`` are skipped). Per-model accuracy, a final report and
    the questions missed by at least ``miss_threshold`` models are printed.

    A request that raises is printed and scored as incorrect; it is not added
    to the missed-question report because no answer was obtained.

    Args:
        language: Language name; "french" and "spanish" (any case) select a
            translated prompt, anything else uses the English prompt.
        filename: Path to the JSON question file.

    Returns:
        ``{language.lower(): {label: accuracy}}`` with accuracy in [0, 1].
        An empty dataset yields an accuracy of 0.0 for every model.
    """
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    lang_key = language.lower()
    print(f"\n=== Benchmarking {language.title()} Quantum Computing Questions ===")
    total = len(data)
    results = {}
    missed_by = defaultdict(list)

    for label_id in model_run_order:
        if label_id not in model_lookup:
            continue
        provider, model_id = model_lookup[label_id]

        if provider in _SUPPORTED_PROVIDERS:
            questions = data
        else:
            # Same outcome as before (the model scores 0%), but say why.
            print(f" {label_id}: unsupported provider '{provider}', no questions asked")
            questions = []

        correct = 0
        failed = 0
        for i, q in enumerate(questions):
            prompt = _build_prompt(lang_key, q)
            try:
                predicted = _extract_choice(_query_model(provider, model_id, prompt))
                if predicted == q["solution"]:
                    correct += 1
                else:
                    missed_by[i].append((label_id, predicted))
            except Exception as e:
                # Best effort: one failed request must not abort the whole run.
                failed += 1
                print(f"{provider.title()} error ({label_id}): {e}")

        # Guard against an empty dataset instead of dividing by zero.
        acc = correct / total if total else 0.0
        results[label_id] = acc
        print(f" {label_id} accuracy: {acc:.2%}")
        if failed:
            print(f" {label_id}: {failed} request(s) failed and were scored as incorrect")

    # Print report for this language
    print(f"\n=== FINAL ACCURACY REPORT ({language.upper()}) ===")
    for model in model_run_order:
        if model in results:
            print(f"{model}: {results[model]:.2%}")

    print(f"\n=== Questions Missed by ≥ {miss_threshold} Models ===")
    # Report in question order rather than in the order misses were first seen.
    for q_idx in sorted(missed_by):
        models_missed = missed_by[q_idx]
        if len(models_missed) >= miss_threshold:
            question_text = data[q_idx]["question"]
            correct_answer = data[q_idx]["solution"]
            print(f"\nQ{q_idx+1} missed by {len(models_missed)} models: {question_text[:100]}...")
            print(f"Correct Answer: {correct_answer}")
            model_list = ", ".join(f"{m}({ans})" for m, ans in models_missed)
            print(f"Missed by: {model_list}")

    return {lang_key: results}

# Evaluate each language in turn (French first, then Spanish)
french_results = run_multilingual_benchmark("french", french_file)
spanish_results = run_multilingual_benchmark("spanish", spanish_file)

# Merge the per-language mappings into a single dict keyed by language
all_results = {**french_results, **spanish_results}

# Persist the merged scores as pretty-printed UTF-8 JSON
with open("multilingual_results.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(all_results, indent=2, ensure_ascii=False))
